import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import folktables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from folktables import ACSDataSource, ACSEmployment, generate_categories, ACSIncome, ACSHealthInsurance
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline, FeatureUnion
import math
import sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import scipy as sp
from scipy import stats
import rfpimp
import dalex as dx
import warnings
warnings.filterwarnings("ignore")
import xgboost as xg
# Human-readable labels for the ACS PUMS categorical codes; passed to
# ACSIncome.df_to_pandas below so the decoded frame carries text labels.
# NOTE(review): several values are built from adjacent string literals that
# concatenate WITHOUT a separating space (e.g. "...company or" "business..."
# becomes "company orbusiness").  Later replace()/column-name lookups match
# these exact concatenated strings, so do not "fix" the spacing here without
# updating every downstream call site as well.
ACSIncome_categories = {
"COW": {
1.0: (
"Employee of a private for-profit company or"
"business, or of an individual, for wages,"
"salary, or commissions"
),
2.0: (
"Employee of a private not-for-profit, tax-exempt,"
"or charitable organization"
),
3.0: "Local government employee (city, county, etc.)",
4.0: "State government employee",
5.0: "Federal government employee",
6.0: (
"Self-employed in own not incorporated business,"
"professional practice, or farm"
),
7.0: (
"Self-employed in own incorporated business,"
"professional practice or farm"
),
8.0: "Working without pay in family business or farm",
9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
},
# Educational attainment, from no schooling (1) through doctorate (24).
"SCHL": {
1.0: "No schooling completed",
2.0: "Nursery school, preschool",
3.0: "Kindergarten",
4.0: "Grade 1",
5.0: "Grade 2",
6.0: "Grade 3",
7.0: "Grade 4",
8.0: "Grade 5",
9.0: "Grade 6",
10.0: "Grade 7",
11.0: "Grade 8",
12.0: "Grade 9",
13.0: "Grade 10",
14.0: "Grade 11",
15.0: "12th grade - no diploma",
16.0: "Regular high school diploma",
17.0: "GED or alternative credential",
18.0: "Some college, but less than 1 year",
19.0: "1 or more years of college credit, no degree",
20.0: "Associate's degree",
21.0: "Bachelor's degree",
22.0: "Master's degree",
23.0: "Professional degree beyond a bachelor's degree",
24.0: "Doctorate degree",
},
# Marital status.
"MAR": {
1.0: "Married",
2.0: "Widowed",
3.0: "Divorced",
4.0: "Separated",
5.0: "Never married or under 15 years old",
},
"SEX": {1.0: "Male", 2.0: "Female"},
# Detailed race code (RAC1P).
"RAC1P": {
1.0: "White alone",
2.0: "Black or African American alone",
3.0: "American Indian alone",
4.0: "Alaska Native alone",
5.0: (
"American Indian and Alaska Native tribes specified;"
"or American Indian or Alaska Native,"
"not specified and no other"
),
6.0: "Asian alone",
7.0: "Native Hawaiian and Other Pacific Islander alone",
8.0: "Some Other Race alone",
9.0: "Two or More Races",
},
}
#edited to include WAGP and disclude RELP and POBP
# Custom folktables prediction problem: predict whether total person income
# (PINCP) exceeds $50k from the listed person-level features.
ACSIncome = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'WKHP',
        'SEX',
        'RAC1P',
        'WAGP',
        'HISP'
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='RAC1P',
    preprocess=folktables.adult_filter,
    # BUG FIX: np.nan_to_num's second positional parameter is `copy`, not the
    # fill value, so the original `np.nan_to_num(x, -1)` silently filled NaNs
    # with the default 0.0.  Pass the intended sentinel via the nan= keyword.
    postprocess=lambda x: np.nan_to_num(x, nan=-1.0),
)
# Download the 2018 1-year California person-level sample and materialise
# features/labels as pandas objects with readable labels (no dummy columns).
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)
ca_features1, ca_labels1, _ = ACSIncome.df_to_pandas(ca_data, categories=ACSIncome_categories, dummies=False)
ca_features1
| AGEP | COW | SCHL | MAR | OCCP | WKHP | SEX | RAC1P | WAGP | HISP | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30 | Self-employed in own not incorporated business... | Grade 11 | Married | 9610.0 | 40.0 | Male | Some Other Race alone | 500.0 | 2 |
| 1 | 21 | State government employee | Regular high school diploma | Never married or under 15 years old | 1970.0 | 20.0 | Male | White alone | 7700.0 | 1 |
| 2 | 65 | Employee of a private not-for-profit, tax-exem... | Master's degree | Never married or under 15 years old | 2040.0 | 8.0 | Male | White alone | 5000.0 | 1 |
| 3 | 33 | Employee of a private for-profit company orbus... | Grade 11 | Divorced | 9610.0 | 40.0 | Male | White alone | 12000.0 | 1 |
| 4 | 18 | Employee of a private not-for-profit, tax-exem... | 1 or more years of college credit, no degree | Never married or under 15 years old | 1021.0 | 18.0 | Female | White alone | 300.0 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195660 | 38 | Employee of a private for-profit company orbus... | Master's degree | Married | 1021.0 | 40.0 | Male | Asian alone | 565000.0 | 1 |
| 195661 | 39 | Employee of a private for-profit company orbus... | Master's degree | Married | 1021.0 | 40.0 | Female | Asian alone | 209000.0 | 1 |
| 195662 | 61 | Employee of a private for-profit company orbus... | 1 or more years of college credit, no degree | Married | 5240.0 | 45.0 | Male | White alone | 105000.0 | 1 |
| 195663 | 69 | Self-employed in own incorporated business,pro... | Doctorate degree | Married | 2040.0 | 45.0 | Male | Asian alone | 30000.0 | 1 |
| 195664 | 40 | Employee of a private for-profit company orbus... | GED or alternative credential | Married | 9600.0 | 40.0 | Male | Some Other Race alone | 30000.0 | 2 |
195665 rows × 10 columns
# Keep only the four largest race groups, concatenated in a fixed order
# (Asian, Black, White, Other) so downstream displays group by race.
race_order = [
    'Asian alone',
    'Black or African American alone',
    'White alone',
    'Some Other Race alone',
]
plottable = pd.concat(
    [ca_features1[ca_features1['RAC1P'] == race] for race in race_order]
)
# Cast the numeric columns from float to int in one pass.
plottable = plottable.astype({'AGEP': int, 'OCCP': int, 'WKHP': int,
                              'WAGP': int, 'HISP': int})
# FIX: the original used chained `plottable[col].replace(..., inplace=True)`,
# which is deprecated chained assignment (a hard error in pandas 3.0).
# Assign the replaced Series back instead, and merge the four per-race
# replace calls into a single mapping.
plottable['MAR'] = plottable['MAR'].replace(
    {"Never married or under 15 years old": "Never married"})
plottable['RAC1P'] = plottable['RAC1P'].replace({
    "Asian alone": "Asian",
    "Black or African American alone": "Black",
    "White alone": "White",
    "Some Other Race alone": "Other",
})
plottable = plottable.rename(columns={"AGEP": "AGE", "RAC1P": "RACE", "WAGP": "SALARY"})
# Trim the extreme right tail of salaries.
plottable = plottable[plottable['SALARY'] < 350000]
# Collapse the detailed Hispanic-origin codes: 1 = not Hispanic, and every
# code in 3..24 is a specific Hispanic origin, mapped onto 2 ("Hispanic").
# FIX: the original looped `.replace(i, 2, regex=True)` once per code —
# regex=True is meaningless for integer values, and the loop is a single
# vectorised replace.
plottable['HISP'] = plottable['HISP'].replace(list(range(3, 25)), 2)
# Collapse the 24 detailed SCHL levels into five coarse education bands.
# FIX: assign the result back instead of the deprecated chained
# `Series.replace(..., inplace=True)` pattern.
_below_hs = "Some High School or below"
schl_map = {
    # Grade 1 .. Grade 11 all fall below a high-school diploma.
    **{f"Grade {g}": _below_hs for g in range(1, 12)},
    "No schooling completed": _below_hs,
    "Nursery school, preschool": _below_hs,
    "Kindergarten": _below_hs,
    "12th grade - no diploma": _below_hs,
    "Regular high school diploma": "High School Diploma or Equivalent",
    "GED or alternative credential": "High School Diploma or Equivalent",
    "Some college, but less than 1 year": "Some College",
    "1 or more years of college credit, no degree": "Some College",
    "Associate's degree": "Some College",
    "Bachelor's degree": "Bachelors degree",
    "Master's degree": "Graduate degree",
    "Professional degree beyond a bachelor's degree": "Graduate degree",
    "Doctorate degree": "Graduate degree",
}
plottable["SCHL"] = plottable["SCHL"].replace(schl_map)
# Collapse the nine class-of-worker labels into three employment sectors.
# NOTE: the keys must match the space-less concatenated strings produced by
# ACSIncome_categories (e.g. "company orbusiness") byte-for-byte.
# FIX: assign back instead of the deprecated chained inplace replace.
plottable['COW'] = plottable['COW'].replace({
    "Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions": "Private Employee",
    "Employee of a private not-for-profit, tax-exempt,or charitable organization": "Private Employee",
    "Local government employee (city, county, etc.)": "Government Employee",
    "State government employee": "Government Employee",
    "Federal government employee": "Government Employee",
    "Self-employed in own not incorporated business,professional practice, or farm": "Self_Employed",
    "Self-employed in own incorporated business,professional practice or farm": "Self_Employed",
    "Working without pay in family business or farm": "Self_Employed",
})
plottable
| AGE | COW | SCHL | MAR | OCCP | WKHP | SEX | RACE | SALARY | HISP | |
|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | Private Employee | Bachelors degree | Never married | 2545 | 20 | Female | Asian | 4000 | 1 |
| 33 | 18 | Private Employee | High School Diploma or Equivalent | Never married | 9610 | 8 | Female | Asian | 1500 | 1 |
| 49 | 18 | Private Employee | Some College | Never married | 725 | 12 | Female | Asian | 1400 | 1 |
| 53 | 25 | Government Employee | Bachelors degree | Never married | 3870 | 40 | Male | Asian | 13000 | 1 |
| 80 | 20 | Private Employee | High School Diploma or Equivalent | Never married | 725 | 18 | Female | Asian | 650 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | Private Employee | Bachelors degree | Never married | 2300 | 40 | Female | Other | 20000 | 2 |
| 195628 | 46 | Private Employee | High School Diploma or Equivalent | Married | 8740 | 40 | Female | Other | 12000 | 2 |
| 195629 | 50 | Private Employee | Some High School or below | Married | 7340 | 50 | Male | Other | 17900 | 2 |
| 195642 | 42 | Private Employee | Some High School or below | Married | 6260 | 40 | Male | Other | 61000 | 2 |
| 195664 | 40 | Private Employee | High School Diploma or Equivalent | Married | 9600 | 40 | Male | Other | 30000 | 2 |
182867 rows × 10 columns
plottable.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| AGE | 182867.0 | 42.909169 | 14.899778 | 17.0 | 30.0 | 42.0 | 55.0 | 94.0 |
| OCCP | 182867.0 | 4036.304730 | 2637.664438 | 10.0 | 2016.0 | 4110.0 | 5550.0 | 9830.0 |
| WKHP | 182867.0 | 37.784111 | 12.926377 | 1.0 | 32.0 | 40.0 | 40.0 | 99.0 |
| SALARY | 182867.0 | 50260.777067 | 52382.233154 | 0.0 | 12500.0 | 35000.0 | 70000.0 | 329000.0 |
| HISP | 182867.0 | 1.331192 | 0.470643 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Frequency of each age value (index = age, value = count), plotted as a line.
freq_age = plottable['AGE'].value_counts().sort_index()
freq_age.to_frame()
# Removed: unused x_data / y_data — stats.norm.pdf applied to the *counts*
# was never plotted and had no statistical meaning.
plt.plot(freq_age)
plt.title('Distribution of Age in the Dataset')
plt.xlabel('Age')
Text(0.5, 0, 'Age')
plt.grid()  # grid for the preceding age line plot (pyplot global state)
# Histogram of ages; 78 bins ≈ one bin per year over the observed age span.
plt.hist(plottable['AGE'], bins=78, alpha=0.8)
plt.title('Distribution of Age in the Dataset')
plt.xlabel('Age')
Text(0.5, 0, 'Age')
plt.grid()  # grid for the preceding age histogram (pyplot global state)
# Histogram of usual hours worked per week.
plt.hist(plottable['WKHP'], bins=20, alpha=0.8)
plt.title('Distribution of Hours Worked per Week in the Dataset')
plt.xlabel('Hours Worked per Week')
Text(0.5, 0, 'Hours Worked per Week')
plt.grid()  # grid for the preceding hours histogram (pyplot global state)
# Histogram of salaries (already truncated to < $350k above).
plt.hist(plottable['SALARY'], bins=20, alpha=0.8)
plt.title('Distribution of Salary in the Dataset')
plt.xlabel('Salary')
Text(0.5, 0, 'Salary')
# Salary distribution per race group as side-by-side box plots.
plottable.boxplot(by ='RACE', column =['SALARY'], grid = True)
<AxesSubplot:title={'center':'SALARY'}, xlabel='RACE'>
# Salary summaries split by race and sex; the median is reported alongside
# the mean because the salary distribution has a long right tail.
by_race_gender_mean = (
    plottable.groupby(['RACE', 'SEX'], as_index=False)['SALARY']
    .mean()
    .rename(columns={'SALARY': "Mean_Salary"})
)
by_race_gender_median = (
    plottable.groupby(['RACE', 'SEX'], as_index=False)['SALARY']
    .median()
    .rename(columns={'SALARY': "Median_Salary"})
)
print(by_race_gender_mean, "\n")
print(by_race_gender_median)
RACE SEX Mean_Salary
0 Asian Female 52864.154185
1 Asian Male 67356.512708
2 Black Female 40330.507380
3 Black Male 44765.347056
4 Other Female 27031.443645
5 Other Male 35275.147723
6 White Female 43438.414165
7 White Male 59038.678868
RACE SEX Median_Salary
0 Asian Female 39000.0
1 Asian Male 50000.0
2 Black Female 30000.0
3 Black Male 31100.0
4 Other Female 21400.0
5 Other Male 30000.0
6 White Female 30000.0
7 White Male 40900.0
# Grouped bar chart of mean salary by race, split by sex.
fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(by_race_gender_mean.RACE.unique()))
# CONSISTENCY: use a named bar_width like the median chart below, instead of
# the hard-coded 0.4 / 0.2 literals (0.2 was silently bar_width / 2).
bar_width = 0.4
b1 = ax.bar(x, by_race_gender_mean.loc[by_race_gender_mean['SEX'] == 'Male', 'Mean_Salary'],
            width=bar_width, label='Male')
# Offset the female bars by one bar width so the pairs sit side by side.
b2 = ax.bar(x + bar_width, by_race_gender_mean.loc[by_race_gender_mean['SEX'] == 'Female', 'Mean_Salary'],
            width=bar_width, label='Female')
# Centre each tick between the paired bars.
ax.set_xticks(x + bar_width / 2)
ax.set_xticklabels(by_race_gender_mean.RACE.unique())
ax.set_ylabel("Salary in $")
ax.bar_label(b1, padding=3)
ax.bar_label(b2, padding=3)
ax.set_title("Mean Salary by Sex and Race")
ax.legend()
<matplotlib.legend.Legend at 0x293fc20bb20>
# Grouped bar chart of median salary by race, split by sex — same layout as
# the mean-salary chart.
fig, ax_med = plt.subplots(figsize=(12, 8))
races = by_race_gender_median.RACE.unique()
x = np.arange(len(races))
# Width of each bar; the female bars are shifted right by exactly this much.
width = 0.4
male_rows = by_race_gender_median['SEX'] == 'Male'
female_rows = by_race_gender_median['SEX'] == 'Female'
male_bars = ax_med.bar(x, by_race_gender_median.loc[male_rows, 'Median_Salary'],
                       width=width, label='Male')
female_bars = ax_med.bar(x + width, by_race_gender_median.loc[female_rows, 'Median_Salary'],
                         width=width, label="Female")
# Put each tick halfway between the paired bars.
ax_med.set_xticks(x + width / 2)
ax_med.set_xticklabels(races)
ax_med.bar_label(male_bars, padding=3)
ax_med.bar_label(female_bars, padding=3)
ax_med.set_ylabel("Salary in $")
ax_med.set_title("Median Salary by Sex and Race")
ax_med.legend()
<matplotlib.legend.Legend at 0x293fc286590>
import seaborn as sns  # NOTE(review): third-party import mid-file (notebook cell)
# Counts per race, split by the binary HISP flag (1 = non-Hispanic,
# 2 = Hispanic after the earlier collapse of codes 3..24).
sns.countplot(x="RACE", hue="HISP", data=plottable);
plt.title('Hispanic population by race');
plt.legend(bbox_to_anchor=(.68, 1.0), loc='upper left', labels = ['Non-Hispanic','Hispanic'])
<matplotlib.legend.Legend at 0x293fc229a20>
# Binary classification target: True when SALARY is in (0, median],
# False when in (median, 350001].
# NOTE(review): pd.cut's bins are left-open, so rows with SALARY == 0 fall
# outside both bins and get NaN here (they are dropped further down) —
# confirm that is intended rather than passing include_lowest=True.
salary_fit = pd.cut(plottable.SALARY, bins = [0,plottable['SALARY'].median(),350001], labels= [True, False])
plottable.insert(10, 'Salary_below_median', salary_fit)
plottable
| AGE | COW | SCHL | MAR | OCCP | WKHP | SEX | RACE | SALARY | HISP | Salary_below_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | Private Employee | Bachelors degree | Never married | 2545 | 20 | Female | Asian | 4000 | 1 | True |
| 33 | 18 | Private Employee | High School Diploma or Equivalent | Never married | 9610 | 8 | Female | Asian | 1500 | 1 | True |
| 49 | 18 | Private Employee | Some College | Never married | 725 | 12 | Female | Asian | 1400 | 1 | True |
| 53 | 25 | Government Employee | Bachelors degree | Never married | 3870 | 40 | Male | Asian | 13000 | 1 | True |
| 80 | 20 | Private Employee | High School Diploma or Equivalent | Never married | 725 | 18 | Female | Asian | 650 | 1 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | Private Employee | Bachelors degree | Never married | 2300 | 40 | Female | Other | 20000 | 2 | True |
| 195628 | 46 | Private Employee | High School Diploma or Equivalent | Married | 8740 | 40 | Female | Other | 12000 | 2 | True |
| 195629 | 50 | Private Employee | Some High School or below | Married | 7340 | 50 | Male | Other | 17900 | 2 | True |
| 195642 | 42 | Private Employee | Some High School or below | Married | 6260 | 40 | Male | Other | 61000 | 2 | False |
| 195664 | 40 | Private Employee | High School Diploma or Equivalent | Married | 9600 | 40 | Male | Other | 30000 | 2 | True |
182867 rows × 11 columns
# Ordinal integer encodings for the coarsened categorical features, so the
# cleaned frame can be handed straight to scikit-learn estimators.
ACSIncome_categories_1 = {
    "COW": {
        "Private Employee": 1,
        "Government Employee": 2,
        "Self_Employed": 3,
    },
    # Education bands ordered from least (1) to most (5) schooling.
    "SCHL": {
        "Some High School or below": 1,
        "High School Diploma or Equivalent": 2,
        "Some College": 3,
        "Bachelors degree": 4,
        "Graduate degree": 5,
    },
    "MAR": {
        "Married": 1,
        "Widowed": 2,
        "Divorced": 3,
        "Separated": 4,
        "Never married": 5,
    },
    "SEX": {"Male": 1, "Female": 2},
    "RACE": {"White": 1, "Black": 2, "Asian": 3, "Other": 4},
}
# Apply the per-column label -> integer-code mappings; cells with no entry
# in the mapping are left unchanged.
obj_df = plottable.replace(ACSIncome_categories_1)
obj_df
| AGE | COW | SCHL | MAR | OCCP | WKHP | SEX | RACE | SALARY | HISP | Salary_below_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | 1 | 4 | 5 | 2545 | 20 | 2 | 3 | 4000 | 1 | True |
| 33 | 18 | 1 | 2 | 5 | 9610 | 8 | 2 | 3 | 1500 | 1 | True |
| 49 | 18 | 1 | 3 | 5 | 725 | 12 | 2 | 3 | 1400 | 1 | True |
| 53 | 25 | 2 | 4 | 5 | 3870 | 40 | 1 | 3 | 13000 | 1 | True |
| 80 | 20 | 1 | 2 | 5 | 725 | 18 | 2 | 3 | 650 | 1 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | 1 | 4 | 5 | 2300 | 40 | 2 | 4 | 20000 | 2 | True |
| 195628 | 46 | 1 | 2 | 1 | 8740 | 40 | 2 | 4 | 12000 | 2 | True |
| 195629 | 50 | 1 | 1 | 1 | 7340 | 50 | 1 | 4 | 17900 | 2 | True |
| 195642 | 42 | 1 | 1 | 1 | 6260 | 40 | 1 | 4 | 61000 | 2 | False |
| 195664 | 40 | 1 | 2 | 1 | 9600 | 40 | 1 | 4 | 30000 | 2 | True |
182867 rows × 11 columns
# Drop rows whose target is NaN (rows with SALARY == 0 fell outside pd.cut's
# left-open first bin and never received a label).
obj_df = obj_df[obj_df['Salary_below_median'].notna()]
# Sanity check: no NaN labels remain.  (Removed a dead commented-out line
# that tried to dropna() on the column alone.)
obj_df['Salary_below_median'].isna().sum()
0
obj_df
| AGE | COW | SCHL | MAR | OCCP | WKHP | SEX | RACE | SALARY | HISP | Salary_below_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | 1 | 4 | 5 | 2545 | 20 | 2 | 3 | 4000 | 1 | True |
| 33 | 18 | 1 | 2 | 5 | 9610 | 8 | 2 | 3 | 1500 | 1 | True |
| 49 | 18 | 1 | 3 | 5 | 725 | 12 | 2 | 3 | 1400 | 1 | True |
| 53 | 25 | 2 | 4 | 5 | 3870 | 40 | 1 | 3 | 13000 | 1 | True |
| 80 | 20 | 1 | 2 | 5 | 725 | 18 | 2 | 3 | 650 | 1 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | 1 | 4 | 5 | 2300 | 40 | 2 | 4 | 20000 | 2 | True |
| 195628 | 46 | 1 | 2 | 1 | 8740 | 40 | 2 | 4 | 12000 | 2 | True |
| 195629 | 50 | 1 | 1 | 1 | 7340 | 50 | 1 | 4 | 17900 | 2 | True |
| 195642 | 42 | 1 | 1 | 1 | 6260 | 40 | 1 | 4 | 61000 | 2 | False |
| 195664 | 40 | 1 | 2 | 1 | 9600 | 40 | 1 | 4 | 30000 | 2 | True |
168541 rows × 11 columns
# Feature matrix (drop the raw SALARY and the derived label) and the binary
# target.  Renamed from the misleading `train, test` — these are the full X
# and y, not a train/test split.
X_all, y_all = obj_df.drop(['SALARY', 'Salary_below_median'], axis=1), obj_df['Salary_below_median']
# BUG FIX: np.random.seed() RETURNS None, so the original
# random_state=np.random.seed() produced an unseeded, non-reproducible split.
# Use a fixed integer seed instead.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)
# Categorical True/False labels -> 0/1 ints for the classifiers.
y_train = y_train.astype('int')
y_test = y_test.astype('int')
# Fit five classifiers on the same train split and predict on the test split.
# NOTE(review): none of these set random_state, so models that draw from
# NumPy's global RNG give different results if the cells are re-run or
# reordered — keep this statement order when reproducing results.
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
dtc_pred=dtc.predict(X_test)
# Linear classifier trained with stochastic gradient descent.
clf = SGDClassifier()
# fit (train) the classifier
clf.fit(X_train, y_train)
clf_pred=clf.predict(X_test)
# Bagged ensemble of 100 trees.
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
# Boosted ensemble of 100 trees.
gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)
# XGBoost gradient-boosted trees with default hyperparameters.
xgb = xg.XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
# Hold-out accuracy for each of the five fitted classifiers.
clf_acc = accuracy_score(y_test, clf_pred)
print(f"Linear Accuracy: {clf_acc}")
rfc_acc = accuracy_score(y_test, rfc_pred)
print(f"Random Forest Accuracy: {rfc_acc}")
gbc_acc = accuracy_score(y_test, gbc_pred)
print(f"Gradient Boosting Accuracy: {gbc_acc}")
dtc_acc = accuracy_score(y_test, dtc_pred)
print(f"Decision Tree Accuracy: {dtc_acc}")
xgb_acc = accuracy_score(y_test, xgb_pred)
print(f"XGBoost Accuracy: {xgb_acc}")
Linear Accuracy: 0.7598267524993325 Random Forest Accuracy: 0.8059865317867632 Gradient Boosting Accuracy: 0.8150938918389747 Decision Tree Accuracy: 0.7603310688540152 XGBoost Accuracy: 0.8237562668723486
# Permutation feature importances (rfpimp) for the SGD linear model,
# computed on the held-out test set.
imp_clf = rfpimp.importances(clf, X_test, y_test)
imp_clf
| Importance | |
|---|---|
| Feature | |
| WKHP | 0.1038 |
| MAR | 0.0390 |
| SCHL | 0.0342 |
| OCCP | 0.0232 |
| HISP | 0.0090 |
| RACE | 0.0070 |
| SEX | 0.0036 |
| AGE | 0.0010 |
| COW | -0.0018 |
# Permutation feature importances for the random forest on the test set.
imp_rfc = rfpimp.importances(rfc, X_test, y_test)
imp_rfc
| Importance | |
|---|---|
| Feature | |
| WKHP | 0.1268 |
| OCCP | 0.0744 |
| AGE | 0.0456 |
| SCHL | 0.0282 |
| SEX | 0.0150 |
| HISP | 0.0086 |
| COW | 0.0078 |
| RACE | 0.0062 |
| MAR | 0.0044 |
# Permutation feature importances for the gradient-boosting model on the test set.
imp_gbc = rfpimp.importances(gbc, X_test, y_test)
imp_gbc
| Importance | |
|---|---|
| Feature | |
| WKHP | 0.1260 |
| AGE | 0.0570 |
| OCCP | 0.0444 |
| SCHL | 0.0344 |
| SEX | 0.0090 |
| MAR | 0.0054 |
| HISP | 0.0044 |
| COW | 0.0028 |
| RACE | 0.0014 |
# dalex explainers for each fitted model (verbose=False silences setup logs).
exp_lin = dx.Explainer(clf, X_test, y_test, verbose=False)
exp_rfg = dx.Explainer(rfc, X_test, y_test, verbose=False)
exp_gbc = dx.Explainer(gbc, X_test, y_test, verbose=False)
exp_dtc = dx.Explainer(dtc, X_test, y_test, verbose=False)
exp_xgb = dx.Explainer(xgb, X_test, y_test, verbose=False)
# FIX: the chained DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0 — build the model-comparison table with pd.concat instead.
pd.concat([
    exp.model_performance().result
    for exp in (exp_lin, exp_rfg, exp_gbc, exp_dtc, exp_xgb)
])
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| SGDClassifier | 0.660011 | 0.792164 | 0.720075 | 0.759827 | 0.753829 |
| RandomForestClassifier | 0.774102 | 0.803645 | 0.788597 | 0.805749 | 0.882478 |
| GradientBoostingClassifier | 0.765988 | 0.826268 | 0.794987 | 0.815094 | 0.894318 |
| DecisionTreeClassifier | 0.753058 | 0.734256 | 0.743538 | 0.756860 | 0.763279 |
| XGBClassifier | 0.783672 | 0.830244 | 0.806286 | 0.823756 | 0.904609 |
# NOTE(review): this re-definition is identical to the ACSIncome_categories
# mapping defined near the top of the notebook — presumably repeated so this
# section's cells can be run on their own; consider deduplicating.
# The same caveat applies: adjacent string literals concatenate WITHOUT
# spaces (e.g. "company orbusiness") and downstream column names depend on
# these exact strings.
ACSIncome_categories = {
"COW": {
1.0: (
"Employee of a private for-profit company or"
"business, or of an individual, for wages,"
"salary, or commissions"
),
2.0: (
"Employee of a private not-for-profit, tax-exempt,"
"or charitable organization"
),
3.0: "Local government employee (city, county, etc.)",
4.0: "State government employee",
5.0: "Federal government employee",
6.0: (
"Self-employed in own not incorporated business,"
"professional practice, or farm"
),
7.0: (
"Self-employed in own incorporated business,"
"professional practice or farm"
),
8.0: "Working without pay in family business or farm",
9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
},
"SCHL": {
1.0: "No schooling completed",
2.0: "Nursery school, preschool",
3.0: "Kindergarten",
4.0: "Grade 1",
5.0: "Grade 2",
6.0: "Grade 3",
7.0: "Grade 4",
8.0: "Grade 5",
9.0: "Grade 6",
10.0: "Grade 7",
11.0: "Grade 8",
12.0: "Grade 9",
13.0: "Grade 10",
14.0: "Grade 11",
15.0: "12th grade - no diploma",
16.0: "Regular high school diploma",
17.0: "GED or alternative credential",
18.0: "Some college, but less than 1 year",
19.0: "1 or more years of college credit, no degree",
20.0: "Associate's degree",
21.0: "Bachelor's degree",
22.0: "Master's degree",
23.0: "Professional degree beyond a bachelor's degree",
24.0: "Doctorate degree",
},
"MAR": {
1.0: "Married",
2.0: "Widowed",
3.0: "Divorced",
4.0: "Separated",
5.0: "Never married or under 15 years old",
},
"SEX": {1.0: "Male", 2.0: "Female"},
"RAC1P": {
1.0: "White alone",
2.0: "Black or African American alone",
3.0: "American Indian alone",
4.0: "Alaska Native alone",
5.0: (
"American Indian and Alaska Native tribes specified;"
"or American Indian or Alaska Native,"
"not specified and no other"
),
6.0: "Asian alone",
7.0: "Native Hawaiian and Other Pacific Islander alone",
8.0: "Some Other Race alone",
9.0: "Two or More Races",
},
}
# Same prediction problem as above, but this time the features are expanded
# into one-hot dummy columns (dummies=True below).
ACSIncome = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'WKHP',
        'SEX',
        'RAC1P',
        'WAGP',
        'HISP'
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='RAC1P',
    preprocess=folktables.adult_filter,
    # BUG FIX: np.nan_to_num's second positional parameter is `copy`, not the
    # fill value, so the original `np.nan_to_num(x, -1)` silently filled NaNs
    # with the default 0.0.  Pass the intended sentinel via the nan= keyword.
    postprocess=lambda x: np.nan_to_num(x, nan=-1.0),
)
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)
ca_features, ca_labels, _ = ACSIncome.df_to_pandas(ca_data, categories=ACSIncome_categories, dummies=True)
ca_features
| AGEP | OCCP | WKHP | WAGP | HISP | COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions | COW_Employee of a private not-for-profit, tax-exempt,or charitable organization | COW_Federal government employee | COW_Local government employee (city, county, etc.) | COW_Self-employed in own incorporated business,professional practice or farm | ... | SEX_Male | RAC1P_Alaska Native alone | RAC1P_American Indian alone | RAC1P_American Indian and Alaska Native tribes specified;or American Indian or Alaska Native,not specified and no other | RAC1P_Asian alone | RAC1P_Black or African American alone | RAC1P_Native Hawaiian and Other Pacific Islander alone | RAC1P_Some Other Race alone | RAC1P_Two or More Races | RAC1P_White alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30.0 | 9610.0 | 40.0 | 500.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 21.0 | 1970.0 | 20.0 | 7700.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 65.0 | 2040.0 | 8.0 | 5000.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 33.0 | 9610.0 | 40.0 | 12000.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 4 | 18.0 | 1021.0 | 18.0 | 300.0 | 7.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195660 | 38.0 | 1021.0 | 40.0 | 565000.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 195661 | 39.0 | 1021.0 | 40.0 | 209000.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 195662 | 61.0 | 5240.0 | 45.0 | 105000.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 195663 | 69.0 | 2040.0 | 45.0 | 30000.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 195664 | 40.0 | 9600.0 | 40.0 | 30000.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
195665 rows × 53 columns
# One-hot (dummies=True) variant of the earlier cleaning: keep the four
# largest race groups, coarsen the SCHL and COW indicator columns, and
# collapse HISP to a binary flag.
asian = ca_features[ca_features['RAC1P_Asian alone'] == 1]
black = ca_features[ca_features['RAC1P_Black or African American alone'] == 1]
white = ca_features[ca_features['RAC1P_White alone'] == 1]
other = ca_features[ca_features['RAC1P_Some Other Race alone'] == 1]
plottable1 = pd.concat([asian, black, white, other])
# Cast the numeric columns from float to int in one pass.
plottable1 = plottable1.astype({'AGEP': int, 'OCCP': int, 'WKHP': int,
                                'WAGP': int, 'HISP': int})
plottable1 = plottable1.rename({"WKHP": "Weekly_Hours_Worked", "WAGP": "SALARY", "AGEP": "AGE", "OCCP": "OCCUPATION", "MAR_Never married or under 15 years old": "MAR_Never married", "RAC1P_Asian alone": "RACE_Asian", "RAC1P_Black or African American alone": "RACE_Black", "RAC1P_White alone": "RACE_White", "RAC1P_Some Other Race alone": "RACE_Other"}, axis=1)
plottable1 = plottable1[plottable1['SALARY'] < 350000]
# Drop the indicator columns of the race groups we excluded above.
plottable1 = plottable1.drop(['RAC1P_Two or More Races', 'RAC1P_Native Hawaiian and Other Pacific Islander alone', 'RAC1P_Alaska Native alone', 'RAC1P_American Indian alone', 'RAC1P_American Indian and Alaska Native tribes specified;or American Indian or Alaska Native,not specified and no other'], axis=1)
# --- Merge the fine-grained SCHL dummies into five coarse bands. ---
# A row has exactly one SCHL dummy set, so summing a group of columns yields
# the indicator for "any level in this group".
below_hs_cols = (['SCHL_12th grade - no diploma']
                 + [f'SCHL_Grade {g}' for g in range(11, 0, -1)]
                 + ['SCHL_Kindergarten', 'SCHL_Nursery school, preschool',
                    'SCHL_No schooling completed'])
plottable1['SCHL_Some High School or below'] = plottable1[below_hs_cols].sum(axis=1)
plottable1 = plottable1.drop(below_hs_cols, axis=1)
hs_diploma_cols = ['SCHL_Regular high school diploma', 'SCHL_GED or alternative credential']
plottable1['SCHL_High School Diploma or Equivalent'] = plottable1[hs_diploma_cols].sum(axis=1)
plottable1 = plottable1.drop(hs_diploma_cols, axis=1)
some_college_cols = ['SCHL_1 or more years of college credit, no degree',
                     "SCHL_Associate's degree",
                     'SCHL_Some college, but less than 1 year']
plottable1['SCHL_Some College'] = plottable1[some_college_cols].sum(axis=1)
plottable1 = plottable1.drop(some_college_cols, axis=1)
plottable1['SCHL_Bachelors degree'] = plottable1["SCHL_Bachelor's degree"]
grad_cols = ["SCHL_Professional degree beyond a bachelor's degree",
             "SCHL_Master's degree", 'SCHL_Doctorate degree']
plottable1['SCHL_Graduate degree'] = plottable1[grad_cols].sum(axis=1)
plottable1 = plottable1.drop(["SCHL_Bachelor's degree"] + grad_cols, axis=1)
# --- Merge the COW dummies into three employment sectors. ---
private_cols = ['COW_Employee of a private not-for-profit, tax-exempt,or charitable organization',
                'COW_Employee of a private for-profit company orbusiness, or of an individual, for wages,salary, or commissions']
plottable1['COW_Private Employee'] = plottable1[private_cols].sum(axis=1)
government_cols = ['COW_Federal government employee',
                   'COW_Local government employee (city, county, etc.)',
                   'COW_State government employee']
plottable1['COW_Government Employee'] = plottable1[government_cols].sum(axis=1)
# BUG FIX: the original summed the "not incorporated" column TWICE and never
# counted "Self-employed in own incorporated business...", then dropped that
# column — leaving incorporated self-employed rows with all COW indicators 0.
self_employed_cols = ['COW_Working without pay in family business or farm',
                      'COW_Self-employed in own not incorporated business,professional practice, or farm',
                      'COW_Self-employed in own incorporated business,professional practice or farm']
plottable1['COW_Self_Employed'] = plottable1[self_employed_cols].sum(axis=1)
plottable1 = plottable1.drop(private_cols + government_cols + self_employed_cols, axis=1)
# Collapse the detailed Hispanic-origin codes 3..24 onto 2 ("Hispanic");
# a single vectorised replace instead of the original per-code loop.
plottable1['HISP'] = plottable1['HISP'].replace(list(range(3, 25)), 2)
plottable1
| AGE | OCCUPATION | Weekly_Hours_Worked | SALARY | HISP | MAR_Divorced | MAR_Married | MAR_Never married | MAR_Separated | MAR_Widowed | ... | RACE_Other | RACE_White | SCHL_Some High School or below | SCHL_High School Diploma or Equivalent | SCHL_Some College | SCHL_Bachelors degree | SCHL_Graduate degree | COW_Private Employee | COW_Government Employee | COW_Self_Employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | 2545 | 20 | 4000 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 33 | 18 | 9610 | 8 | 1500 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 49 | 18 | 725 | 12 | 1400 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 53 | 25 | 3870 | 40 | 13000 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 80 | 20 | 725 | 18 | 650 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | 2300 | 40 | 20000 | 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195628 | 46 | 8740 | 40 | 12000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195629 | 50 | 7340 | 50 | 17900 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195642 | 42 | 6260 | 40 | 61000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195664 | 40 | 9600 | 40 | 30000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
182867 rows × 24 columns
# Bin SALARY at its median: label True for the lower half ((0, median]),
# False for (median, 350001]. NOTE(review): salaries of exactly 0 (and any
# above 350001) fall outside the bin edges and become NaN -- those rows are
# filtered out downstream before modelling.
salary_fit = pd.cut(plottable1.SALARY, bins = [0,plottable1['SALARY'].median(),350001], labels= [True, False])
# Insert the binary target as a new column at position 10.
plottable1.insert(10, 'Salary_below_median', salary_fit)
plottable1
| AGE | OCCUPATION | Weekly_Hours_Worked | SALARY | HISP | MAR_Divorced | MAR_Married | MAR_Never married | MAR_Separated | MAR_Widowed | ... | RACE_Other | RACE_White | SCHL_Some High School or below | SCHL_High School Diploma or Equivalent | SCHL_Some College | SCHL_Bachelors degree | SCHL_Graduate degree | COW_Private Employee | COW_Government Employee | COW_Self_Employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 23 | 2545 | 20 | 4000 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 33 | 18 | 9610 | 8 | 1500 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 49 | 18 | 725 | 12 | 1400 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 53 | 25 | 3870 | 40 | 13000 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 80 | 20 | 725 | 18 | 650 | 1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 195602 | 24 | 2300 | 40 | 20000 | 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195628 | 46 | 8740 | 40 | 12000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195629 | 50 | 7340 | 50 | 17900 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195642 | 42 | 6260 | 40 | 61000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 195664 | 40 | 9600 | 40 | 30000 | 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
182867 rows × 25 columns
# --- Baseline classifiers on the aggregated (plottable1) feature set ---
# Drop rows whose binned salary label is NaN (salary of 0 fell outside the
# pd.cut bin edges above).
plottable1 = plottable1[plottable1['Salary_below_median'].notna()]
# NOTE: train1 holds the features (X) and test1 the target (y).
train1, test1 = plottable1.drop(['SALARY', 'Salary_below_median'], axis=1), plottable1['Salary_below_median']
# BUG FIX: np.random.seed() RETURNS None, so the original call was
# equivalent to random_state=None -- a different, unreproducible split on
# every run. Use a fixed seed so results can be reproduced.
X_train1, X_test1, y_train1, y_test1 = train_test_split(train1, test1, test_size=0.2, random_state=42)
y_train1 = y_train1.astype(int)
y_test1 = y_test1.astype(int)

# Decision tree
dtc_one = DecisionTreeClassifier()
dtc_one.fit(X_train1, y_train1)
dtc_one_pred = dtc_one.predict(X_test1)
# Linear model trained with SGD
clf_one = SGDClassifier()
clf_one.fit(X_train1, y_train1)
clf_one_pred = clf_one.predict(X_test1)
# Random forest
rfc_one = RandomForestClassifier(n_estimators=100)
rfc_one.fit(X_train1, y_train1)
rfc_one_pred = rfc_one.predict(X_test1)
# Gradient boosting
gbc_one = GradientBoostingClassifier(n_estimators=100)
gbc_one.fit(X_train1, y_train1)
gbc_one_pred = gbc_one.predict(X_test1)
# XGBoost
xgb_one = xg.XGBClassifier()
xgb_one.fit(X_train1, y_train1)
xgb_one_pred = xgb_one.predict(X_test1)

# Held-out accuracy for each model
clf_one_acc = accuracy_score(y_test1, clf_one_pred)
print("Linear Accuracy: " + str(clf_one_acc))
rfc_one_acc = accuracy_score(y_test1, rfc_one_pred)
print("Random Forest Accuracy: " + str(rfc_one_acc))
gbc_one_acc = accuracy_score(y_test1, gbc_one_pred)
print("Gradient Boosting Accuracy: " + str(gbc_one_acc))
dtc_one_acc = accuracy_score(y_test1, dtc_one_pred)
print("Decision Tree Accuracy: " + str(dtc_one_acc))
xgb_one_acc = accuracy_score(y_test1, xgb_one_pred)
print("XGBoost Accuracy: " + str(xgb_one_acc))
Linear Accuracy: 0.6527930226349046 Random Forest Accuracy: 0.8029013023228218 Gradient Boosting Accuracy: 0.8155982081936575 Decision Tree Accuracy: 0.7627933192915839 XGBoost Accuracy: 0.827998457385268
# Build a dalex Explainer per model on the held-out set, then collect the
# performance metrics of all five models into one table.
exp_clf1 = dx.Explainer(clf_one, X_test1, y_test1, verbose=False)
exp_rfc1 = dx.Explainer(rfc_one, X_test1, y_test1, verbose=False)
exp_gbc1 = dx.Explainer(gbc_one, X_test1, y_test1, verbose=False)
exp_dtc1 = dx.Explainer(dtc_one, X_test1, y_test1, verbose=False)
exp_xgb1 = dx.Explainer(xgb_one, X_test1, y_test1, verbose=False)
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and REMOVED in
# pandas 2.0; the chained .append calls crash on modern pandas. Use
# pd.concat over the per-model result frames instead.
pd.concat([e.model_performance().result
           for e in (exp_clf1, exp_rfc1, exp_gbc1, exp_dtc1, exp_xgb1)])
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| SGDClassifier | 0.939961 | 0.579525 | 0.716994 | 0.652793 | 0.670109 |
| RandomForestClassifier | 0.774361 | 0.797675 | 0.785845 | 0.802516 | 0.878851 |
| GradientBoostingClassifier | 0.766310 | 0.826914 | 0.795459 | 0.815598 | 0.895566 |
| DecisionTreeClassifier | 0.752045 | 0.741236 | 0.746601 | 0.761132 | 0.766918 |
| XGBClassifier | 0.787675 | 0.835339 | 0.810807 | 0.827998 | 0.907197 |
# --- Bias-unaware random forest on the raw (non-aggregated) features ---
# Drop rows with a missing binned target, then split off features/target.
plottable = plottable[plottable['Salary_below_median'].notna()]
X = plottable.drop(columns=['SALARY', 'Salary_below_median'])
y = plottable.Salary_below_median
# BUG FIX: np.random.seed() returns None, so the split was unreproducible
# (random_state=None); use a fixed seed. X['RACE'] is split alongside X/y
# so the test-set protected attribute stays row-aligned with X_test_fair.
X_train_fair, X_test_fair, y_train_fair, y_test_fair, race_train, race_test = train_test_split(
    X, y, X['RACE'], test_size=0.2, random_state=42)

# One-hot encode categoricals; standard-scale numerics.
categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numerical_features = ['AGE', 'OCCP', 'WKHP', 'HISP']
numerical_transformer = Pipeline(steps=[
    ('scale', StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features)
])
# BUG FIX: same np.random.seed() misuse for the forest's random_state.
classifier = RandomForestClassifier(max_depth=10, n_estimators=20, random_state=42)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])
y_train_fair = y_train_fair.astype('int')
y_test_fair = y_test_fair.astype('int')
clf.fit(X_train_fair, y_train_fair)
unaware_y_preds = clf.predict(X_test_fair)
exp = dx.Explainer(clf, X_test_fair, y_test_fair, label='Random Forest Bias Unaware', verbose=True)
exp.model_performance()
Preparation of a new explainer is initiated -> data : 33709 rows 9 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 33709 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : Random Forest Bias Unaware -> predict function : <function yhat_proba_default at 0x00000293D51C3370> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0369, mean = 0.472, max = 0.995 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.98, mean = 0.00326, max = 0.957 -> model_info : package sklearn A new explainer has been created!
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| Random Forest Bias Unaware | 0.766958 | 0.819584 | 0.792398 | 0.809131 | 0.889062 |
# Permutation variable importance for the bias-unaware model.
exp.model_parts().plot()
# Group fairness metrics by race, with 'White' as the privileged group.
fair_tree=exp.model_fairness(protected=race_test, privileged="White")
fair_tree.metric_scores
| TPR | TNR | PPV | NPV | FNR | FPR | FDR | FOR | ACC | STP | |
|---|---|---|---|---|---|---|---|---|---|---|
| Asian | 0.674 | 0.905 | 0.826 | 0.806 | 0.326 | 0.095 | 0.174 | 0.194 | 0.812 | 0.327 |
| Black | 0.733 | 0.840 | 0.837 | 0.736 | 0.267 | 0.160 | 0.163 | 0.264 | 0.783 | 0.464 |
| Other | 0.875 | 0.595 | 0.798 | 0.723 | 0.125 | 0.405 | 0.202 | 0.277 | 0.776 | 0.709 |
| White | 0.763 | 0.862 | 0.824 | 0.811 | 0.237 | 0.138 | 0.176 | 0.189 | 0.816 | 0.425 |
y_test_fair.groupby(race_test).mean()
RACE Asian 0.400426 Black 0.529595 Other 0.646511 White 0.458866 Name: Salary_below_median, dtype: float64
pd.Series(unaware_y_preds,index=y_test_fair.index).groupby(race_test).mean()
RACE Asian 0.326996 Black 0.463551 Other 0.709130 White 0.425134 dtype: float64
fair_tree.fairness_check()
Bias detected in 2 metrics: FPR, STP
Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.
Ratios of metrics, based on 'White'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
TPR ACC PPV FPR STP
Asian 0.883355 0.995098 1.002427 0.688406 0.769412
Black 0.960682 0.959559 1.015777 1.159420 1.091765
Other 1.146789 0.950980 0.968447 2.934783 1.668235
fair_tree.plot()
from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
class NormalizeColumnByLabel(BaseEstimator, TransformerMixin):
    """Per-group normalization of a single numeric column.

    Fits one Yeo-Johnson ``PowerTransformer`` (with standardization) per
    distinct value of ``label``, and applies the group's own transformer to
    that group's rows of ``col``. Used here to equalize a feature's
    distribution across protected groups.
    """

    def __init__(self, col, label):
        self.col = col            # column to normalize
        self.label = label        # grouping (protected-attribute) column
        self.transformers = {}    # group value -> fitted PowerTransformer

    def fit(self, X, y=None):
        """Fit one transformer per group found in X[label]."""
        for grp in X[self.label].unique():
            values = X.loc[X[self.label] == grp][self.col].values.reshape(-1, 1)
            pt = PowerTransformer(method='yeo-johnson', standardize=True)
            pt.fit(values)
            self.transformers[grp] = pt
        return self

    def transform(self, X, y=None):
        """Return a copy of X with `col` normalized within each group.

        NOTE(review): a group present in X but unseen during fit() would
        raise KeyError -- confirm train/test share the same group values.
        """
        out = X.copy()
        for grp in X[self.label].unique():
            mask = X[self.label] == grp
            values = X.loc[mask][self.col].values.reshape(-1, 1)
            out.loc[mask, self.col] = self.transformers[grp].transform(values)
        return out
# Sanity check: after per-race normalization, WKHP should be ~N(0,1)
# within every race group (mean ~0, std ~1 in the describe output).
n=NormalizeColumnByLabel(col='WKHP',label='RACE')
X_train_norm=n.fit_transform(X_train_fair, y_train_fair)
X_train_norm.groupby('RACE')['WKHP'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RACE | ||||||||
| Asian | 24373.0 | -1.273980e-16 | 1.000021 | -2.780694 | -0.191703 | 0.144517 | 0.144517 | 5.544391 |
| Black | 6447.0 | -3.526814e-16 | 1.000078 | -2.814527 | -0.433156 | 0.179557 | 0.179557 | 4.689407 |
| Other | 16964.0 | -9.005346e-18 | 1.000029 | -3.086510 | -0.267739 | 0.181157 | 0.181157 | 5.764106 |
| White | 87048.0 | 1.935365e-16 | 1.000006 | -2.728541 | -0.284411 | 0.109967 | 0.109967 | 5.059046 |
# Bias-mitigation attempt: Disparate-Impact-Remover-style preprocessing.
# WKHP is normalized within each race group before the shared preprocessor,
# making its distribution comparable across groups.
clf_aware = Pipeline(steps=[
('normalize_priors', NormalizeColumnByLabel(col='WKHP',label='RACE')),
('preprocessor',preprocessor),
('classifier',classifier)
])
clf_aware.fit(X_train_fair,y_train_fair)
aware_y_preds = clf_aware.predict(X_test_fair)
exp_aware = dx.Explainer(clf_aware, X_test_fair,y_test_fair, label='Random Forest DIR', verbose=False)
# Fairness metrics of the mitigated model, same protected attribute.
mf_aware=exp_aware.model_fairness(protected=race_test, privileged="White")
# Compare overall performance of the unaware vs. DIR model.
pd.concat([exp1.model_performance().result for exp1 in [exp,exp_aware]])
| recall | precision | f1 | accuracy | auc | |
|---|---|---|---|---|---|
| Random Forest Bias Unaware | 0.766958 | 0.819584 | 0.792398 | 0.809131 | 0.889062 |
| Random Forest DIR | 0.760899 | 0.821277 | 0.789936 | 0.807796 | 0.889296 |
fair_tree.plot(objects=[mf_aware],type='stacked')
from aif360.algorithms.preprocessing.lfr import LFR
from aif360.datasets import BinaryLabelDataset
class LFRCustom(BaseEstimator, TransformerMixin):
    """Sklearn-style wrapper around AIF360's Learning Fair Representations
    (LFR) pre-processing algorithm.

    Parameters
    ----------
    col : list of feature column names for the incoming array/frame.
    protected_col : protected attribute column name(s) for AIF360.
    unprivileged_groups, privileged_groups : group dicts in AIF360 format,
        e.g. [{'RACE': 0}] / [{'RACE': 1}].
    """

    def __init__(self, col, protected_col, unprivileged_groups, privileged_groups):
        self.col = col
        self.protected_col = protected_col
        self.TR = None  # fitted LFR transformer, set by fit()
        self.unprivileged_groups = unprivileged_groups
        # BUG FIX: attribute was misspelled 'privileged_grous', so fit()
        # crashed with AttributeError when reading self.privileged_groups.
        self.privileged_groups = privileged_groups

    def fit(self, X, y=None):
        """Learn the fair representation from X with binary labels y."""
        d = pd.DataFrame(X, columns=self.col)
        d['response'] = list(y)
        binary_df = BinaryLabelDataset(
            df=d,
            protected_attribute_names=self.protected_col,
            label_names=['response']
        )
        self.TR = LFR(unprivileged_groups=self.unprivileged_groups,
                      privileged_groups=self.privileged_groups, seed=0,
                      k=2, Ax=0.5, Az=0.2,
                      verbose=1)
        self.TR.fit(binary_df, maxiter=5000, maxfun=5000)
        return self

    def transform(self, X, y=None):
        """Map X into the learned fair representation (labels dropped)."""
        d = pd.DataFrame(X, columns=self.col)
        # BUG FIX: 'if y:' raises ValueError when y is a pandas Series
        # (ambiguous truth value) and is falsy for an empty list; test for
        # presence explicitly instead.
        if y is not None:
            d['response'] = list(y)
        else:
            d['response'] = False  # placeholder label required by AIF360
        binary_df = BinaryLabelDataset(
            df=d,
            protected_attribute_names=self.protected_col,
            label_names=['response']
        )
        return self.TR.transform(binary_df).convert_to_dataframe()[0].drop(['response'], axis=1)
# Baseline decision tree on categorical features only, fit on ALL rows
# (no train/test split) -- used for the fairness-slice explorations below.
plottable = plottable[plottable['Salary_below_median'].notna()]
X = plottable.drop(columns=['SALARY','Salary_below_median'], axis=1)
y = plottable.Salary_below_median
categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Only categorical columns are transformed; the numeric ones are dropped by
# ColumnTransformer's default remainder='drop'.
preprocessor = ColumnTransformer(transformers=[
('cat', categorical_transformer, categorical_features)
])
clf = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', DecisionTreeClassifier(max_depth=7, random_state=123))
])
y=y.astype('int')
clf.fit(X, y)
# NOTE(review): this rebinds `clf` and `exp`, shadowing the earlier
# bias-unaware random-forest pipeline and its explainer.
exp = dx.Explainer(clf, X, y)
Preparation of a new explainer is initiated -> data : 168541 rows 9 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 168541 values -> model_class : sklearn.tree._classes.DecisionTreeClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x00000293D51C3370> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.47, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.957, mean = -5.48e-18, max = 0.875 -> model_info : package sklearn A new explainer has been created!
# Fairness slice 1: sex x age bucket (AGE < 35 -> 'young', else 'old'),
# with 'Male_old' as the privileged group.
protected = plottable.SEX + '_' + np.where(plottable.AGE < 35, 'young', 'old')
privileged = 'Male_old'
fobject = exp.model_fairness(protected = protected, privileged=privileged)
fobject.fairness_check(epsilon = 0.8)
Bias detected in 4 metrics: TPR, PPV, FPR, STP
Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.
Ratios of metrics, based on 'Male_old'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
TPR ACC PPV FPR STP
Female_old 1.178330 0.925414 1.210909 1.352564 1.446281
Female_young 1.629797 1.020718 1.585455 1.442308 2.326446
Male_young 1.708804 1.029006 1.489091 1.762821 2.376033
# Fairness slice 2a: race x sex, privileged group 'White_Male'.
# NOTE(review): this `fobject` is immediately overwritten by the next
# assignment, so the race-x-sex fairness result is computed but never
# inspected -- consider binding it to a distinct name.
protected1 = plottable.RACE + '_' + np.where(plottable.SEX == 'Male', 'Male', 'Female')
privileged1 = 'White_Male'
fobject = exp.model_fairness(protected = protected1, privileged=privileged1)
# Fairness slice 2b: race alone; this is the `fobject` reused further down.
protected2 = plottable.RACE
privileged2 = 'White'
fobject = exp.model_fairness(protected = protected2, privileged=privileged2)
fobject.fairness_check(epsilon = 0.8)
Bias detected in 3 metrics: TPR, FPR, STP
Conclusion: your model is not fair because 2 or more criteria exceeded acceptable limits set by epsilon.
Ratios of metrics, based on 'White'. Parameter 'epsilon' was set to 0.8 and therefore metrics should be within (0.8, 1.25)
TPR ACC PPV FPR STP
Asian 0.885965 1.021067 0.994543 0.736842 0.792023
Black 1.136842 0.950843 0.969986 1.695906 1.361823
Other 1.489474 1.008427 1.017735 3.035088 2.082621
# https://medium.com/responsibleml/how-to-easily-check-if-your-ml-model-is-fair-2c173419ae4c
# https://www.kdnuggets.com/2020/12/machine-learning-model-fair.html
# https://freecontent.manning.com/bias-and-fairness-in-machine-learning-part-3-building-a-bias-aware-model/
# https://freecontent.manning.com/bias-and-fairness-in-machine-learning-part-2-building-a-baseline-model-and-features/
# --- Re-run baseline models (target cast to int BEFORE the split here) ---
# NOTE: train1 = features (X), test1 = target (y).
train1, test1 = plottable1.drop(['SALARY', 'Salary_below_median'], axis=1), plottable1['Salary_below_median']
test1 = test1.astype('int')
# BUG FIX: np.random.seed() returns None, so the original was effectively
# random_state=None (unreproducible split). Use a fixed seed.
X_train1, X_test1, y_train1, y_test1 = train_test_split(train1, test1, test_size=0.2, random_state=42)
# Linear model trained with SGD
clf_1 = SGDClassifier()
# fit (train) the classifier
clf_1.fit(X_train1, y_train1)
clf_pred = clf_1.predict(X_test1)
# Random forest
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train1, y_train1)
rfc_pred = rfc.predict(X_test1)
# Gradient boosting
gbc = GradientBoostingClassifier(n_estimators=100)
gbc.fit(X_train1, y_train1)
gbc_pred = gbc.predict(X_test1)
# Held-out accuracies
clf_acc = accuracy_score(y_test1, clf_pred)
print("Linear Accuracy: " + str(clf_acc))
rfc_acc = accuracy_score(y_test1, rfc_pred)
print("Random Forest Accuracy: " + str(rfc_acc))
gbc_acc = accuracy_score(y_test1, gbc_pred)
print("Gradient Boosting Accuracy: " + str(gbc_acc))
Linear Accuracy: 0.5551039781660684 Random Forest Accuracy: 0.7976208134326145 Gradient Boosting Accuracy: 0.8128393010768638
# --- Fairness comparison of several model families on the raw features ---
X, y = plottable.drop(['SALARY', 'Salary_below_median'], axis=1), plottable['Salary_below_median']
y = y.astype('int')
numeric_features = ['AGE', 'OCCP', 'WKHP', 'HISP']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
categorical_features = ['SEX', 'COW', 'MAR', 'RACE', "SCHL"]
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)])
# BUG FIX: np.random.seed() returns None, so each model effectively got
# random_state=None (unreproducible). Use a fixed seed throughout.
# NOTE(review): these models are fit AND evaluated on the full data set
# (no holdout), so the fairness metrics below are in-sample.
sgd_class = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', SGDClassifier(random_state=42))]).fit(X, y)
clf_forest = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', RandomForestClassifier(random_state=42, max_depth=4))]).fit(X, y)
clf_gboost = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', GradientBoostingClassifier(random_state=42))]).fit(X, y)
clf_xgboost = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', xg.XGBClassifier(random_state=42))]).fit(X, y)
# create Explainer objects
exp_forest = dx.Explainer(clf_forest, X, y, verbose=False)
exp_xgboost = dx.Explainer(clf_xgboost, X, y, verbose=False)
exp_sgd = dx.Explainer(sgd_class, X, y, verbose=False)
exp_gboost = dx.Explainer(clf_gboost, X, y, verbose=False)
# create fairness explanations (protected2/privileged2 = race / 'White')
fobject_forest = exp_forest.model_fairness(protected2, privileged2)
fobject_xgboost = exp_xgboost.model_fairness(protected2, privileged2)
fobject_sgd = exp_sgd.model_fairness(protected2, privileged2)
fobject_gboost = exp_gboost.model_fairness(protected2, privileged2)
# fairness-check ratios and raw metric scores across model families
fobject.plot(objects=[fobject_xgboost, fobject_forest, fobject_gboost, fobject_sgd])
fobject.plot(objects=[fobject_forest, fobject_xgboost, fobject_sgd], type="metric_scores")
# Fairness slice 3: sex x Hispanic origin (HISP == 1 means not Hispanic).
protected3 = plottable.SEX + '_' + np.where(plottable.HISP == 1, 'Non-Hispanic', 'Hispanic')
privileged3 = 'Male_Non-Hispanic'
fobject3 = exp.model_fairness(protected = protected3, privileged=privileged3)
# create fairness explanations
fobject_forest3 = exp_forest.model_fairness(protected3, privileged3)
fobject_xgboost3 = exp_xgboost.model_fairness(protected3, privileged3)
fobject_sgd3 = exp_sgd.model_fairness(protected3, privileged3)
fobject_gboost3 = exp_gboost.model_fairness(protected3, privileged3)
# lets see their metric scores
fobject3.plot(objects=[fobject_gboost3,fobject_forest3, fobject_xgboost3,fobject_sgd3]) #, type = "metric_scores")
# Fairness slice 4: Hispanic origin alone.
protected4 = np.where(plottable.HISP == 1, 'Non-Hispanic', 'Hispanic')
privileged4 = 'Non-Hispanic'
fobject4 = exp.model_fairness(protected = protected4, privileged=privileged4)
# create fairness explanations
fobject_forest4 = exp_forest.model_fairness(protected4, privileged4)
fobject_xgboost4 = exp_xgboost.model_fairness(protected4, privileged4)
fobject_sgd4 = exp_sgd.model_fairness(protected4, privileged4)
fobject_gboost4 = exp_gboost.model_fairness(protected4, privileged4)
# lets see their metric scores
fobject4.plot(objects=[fobject_gboost4,fobject_forest4, fobject_xgboost4,fobject_sgd4])
# Fairness slice 5: sex alone, with 'Male' as the privileged group.
protected5 = plottable.SEX
privileged5 = 'Male'
fobject5 = exp.model_fairness(protected = protected5, privileged=privileged5)
# create fairness explanations
fobject_forest5 = exp_forest.model_fairness(protected5, privileged5)
fobject_xgboost5 = exp_xgboost.model_fairness(protected5, privileged5)
fobject_sgd5 = exp_sgd.model_fairness(protected5, privileged5)
fobject_gboost5 = exp_gboost.model_fairness(protected5, privileged5)
# lets see their metric scores
fobject5.plot(objects=[fobject_gboost5,fobject_forest5,fobject_xgboost5,fobject_sgd5])